Based on data analysis using 2036 data points from five geographically close suburbs, we can conclude that distance from a train station has a small but relatively insignificant effect on townhouse prices. Therefore, new home buyers can choose more conveniently located homes without worrying about a significant increase in price.
# Web Scraping Function for 5 Suburbs
# Scrape sold-townhouse records for one suburb from auhouseprices.com.
#
# Adapted from https://embracingtherandom.com/r/web-scraping/rent-scraping/
#
# Args:
#   location: "<postcode>/<suburb>/" path fragment appended to the NSW
#     sold-listing URL, e.g. "2150/parramatta/".
#
# Returns:
#   A data.frame with one row per listing that shows a sold price and the
#   columns address, bedroom, bathroom, carspace, soldprice and yearsold
#   (a Date); NULL when the site reports no listings.
house_scraping <- function(location = "2151/Parramatta/") {
  # type set to townhouse, no other filtering
  page_url <- function(page) {
    paste0(
      "https://www.auhouseprices.com/sold/list/NSW/",
      location,
      page,
      "/?type=townhouse&ymin=0&ymax=0&bmin=0&bmax=0&pmin=0&pmax=0&sort=date&kw="
    )
  }

  # Determine how many pages to scroll through: the first <h2> holds the
  # counts, of which the 2nd number is read as listings-per-page and the 3rd
  # as the total number of listings.
  first_page <- read_html(page_url(1))
  heading <- (first_page %>% html_nodes("h2") %>% html_text())[1]
  counts <- as.numeric(regmatches(heading, gregexpr("[0-9]+", heading))[[1]])
  if (length(counts) < 3 || anyNA(counts[2:3]) || counts[2] <= 0) {
    stop("Could not read the listing counts for location '", location, "'",
         call. = FALSE)
  }
  end_page <- ceiling(counts[3] / counts[2])

  # One data.frame per page, bound together once at the end instead of
  # growing a data.frame with rbind() inside the loop.
  pages <- vector("list", end_page)

  # seq_len() so that zero pages means zero iterations (1:0 would visit
  # pages 1 and 0).
  for (thispage in seq_len(end_page)) {
    # Page 1 was already downloaded above; do not request it a second time.
    webpage <- if (thispage == 1) first_page else read_html(page_url(thispage))
    result <- webpage %>% html_nodes("li") %>% html_text()

    # The first entry mentioning "current" marks the end of the relevant
    # content.
    end_marker <- grep("current", result)
    if (length(end_marker) == 0) {
      stop("No 'current' end marker found on page ", thispage, " for '",
           location, "'", call. = FALSE)
    }
    result <- result[seq_len(end_marker[1])]

    # Remove the redundant "listed price" rows and the prices listed with rent.
    result <- result[!grepl("List|Rent", result)]

    # Rows with a "$" hold "<bedroom> <bathroom> <carspace> $<price>".
    price_rows <- grep("\\$", result)
    if (length(price_rows) == 0) {
      next # nothing on this page has a recorded price
    }
    price_bedroom <- strsplit(result[price_rows], "\\$")
    rooms <- vapply(price_bedroom, `[`, character(1), 1)
    rooms <- strsplit(trimws(rooms), "\\s+")
    price <- vapply(price_bedroom, `[`, character(1), 2)
    price <- as.numeric(gsub(",", "", trimws(price)))

    # The sold date is the row immediately before each price row; sometimes
    # the price is not listed, so only dates that have a price are kept.
    sold_text <- trimws(gsub("Sold on", "", result[price_rows - 1]))
    # Dates appear as "day month year", "month year" or just "year".
    timesold <- lapply(sold_text, function(x) {
      n_parts <- length(strsplit(x, "\\s")[[1]])
      if (n_parts == 3) {
        dmy(x)
      } else if (n_parts == 2) {
        my(x)
      } else {
        as.Date(paste0(x, "-01-01"))
      }
    })
    timesold <- do.call("c", timesold)

    # Addresses are the <h4> entries before the "Auction History" heading.
    address <- webpage %>% html_nodes("h4") %>% html_text()
    history_at <- grep("Auction History", address)
    if (length(history_at) == 0) {
      stop("No 'Auction History' heading found on page ", thispage, " for '",
           location, "'", call. = FALSE)
    }
    address <- address[seq_len(history_at[1] - 1)]

    # For every sold entry the immediately following row should be its price;
    # if it is not, that sold entry has no price record and is dropped.
    # NOTE(review): assumes one <h4> address per "Sold on" entry, in the same
    # order -- confirm against the site markup.
    sold_info <- grep("Sold on", result)
    contain_price <- sold_info %in% (price_rows - 1)
    address <- address[contain_price]

    pages[[thispage]] <- data.frame(
      address = address,
      bedroom = as.numeric(vapply(rooms, `[`, character(1), 1)),
      bathroom = as.numeric(vapply(rooms, `[`, character(1), 2)),
      carspace = as.numeric(vapply(rooms, `[`, character(1), 3)),
      soldprice = price,
      yearsold = timesold
    )
  }

  # rbind() ignores the NULL slots left by skipped pages; with no pages at all
  # the result is NULL.
  do.call(rbind, pages)
}
# suburb name with space need to be joined with "+" sign
# "<postcode>/<suburb>/" URL fragments for the five suburbs in the study.
suburb_locations <- c(
  parramatta = "2150/parramatta/",
  merrylands = "2160/merrylands/",
  auburn = "2144/auburn/",
  eastwood = "2122/eastwood/",
  granville = "2142/granville/"
)
# Scrape each suburb in turn, then expose one data frame per suburb.
scraped_sales <- lapply(
  suburb_locations,
  function(loc) house_scraping(location = loc)
)
df_parramatta <- scraped_sales[["parramatta"]]
df_merrylands <- scraped_sales[["merrylands"]]
df_auburn <- scraped_sales[["auburn"]]
df_eastwood <- scraped_sales[["eastwood"]]
df_granville <- scraped_sales[["granville"]]
# Writing longitude and latitude into dataframe using given address
# Geocode the `address` column with the "arcgis" method; the coordinates are
# written to new `latitude` and `longitude` columns.
add_coordinates <- function(sales) {
  sales %>%
    geocode(address, method = "arcgis", lat = latitude, long = longitude)
}
l_parramatta <- add_coordinates(df_parramatta)
l_merrylands <- add_coordinates(df_merrylands)
l_auburn <- add_coordinates(df_auburn)
l_eastwood <- add_coordinates(df_eastwood)
l_granville <- add_coordinates(df_granville)
# Calculating Distance to Train Station
# Haversine distance between property coordinates and a fixed point, with the
# distHaversine() result divided by 1000 (reported as km by the callers).
#
# Args:
#   lat, lon: latitude and longitude of the property; either single numbers
#     or two vectors of equal length (one entry per property).
#   fixed_lat, fixed_lon: latitude and longitude of the reference point.
#
# Returns:
#   A numeric vector with one distance per (lat, lon) pair.
data_distance_between <- function(lat, lon, fixed_lat, fixed_lon) {
  stopifnot(length(lat) == length(lon))
  # distHaversine() takes longitude first. A two-column matrix is what it
  # builds internally from a single c(lon, lat) pair, and it also lets the
  # same call handle many points at once.
  points <- matrix(c(lon, lat), ncol = 2)
  distHaversine(points, c(fixed_lon, fixed_lat)) / 1000
}
# used Google maps for all longitudes and latitudes
# Append the distance from every property to one station as a new column.
# `sales` must carry `latitude` and `longitude` columns.
add_station_distance <- function(sales, station_lat, station_lon) {
  km_to_station <- apply(
    sales[, c("latitude", "longitude")],
    1,
    function(coords) {
      data_distance_between(coords[1], coords[2], station_lat, station_lon)
    }
  )
  # data.frame() sanitises this column name to "distance_to_train_station.km."
  data.frame(sales, "distance_to_train_station(km)" = km_to_station)
}

# Train station coordinates, one pair per suburb.
parramatta_lat <- -33.8175
parramatta_lon <- 151.0050
merrylands_lat <- -33.8363
merrylands_lon <- 150.9926
auburn_lat <- -33.8490
auburn_lon <- 151.0329
eastwood_lat <- -33.7899
eastwood_lon <- 151.0821
granville_lat <- -33.8326
granville_lon <- 151.0120

l_parramatta_distance <- add_station_distance(l_parramatta, parramatta_lat, parramatta_lon)
l_merrylands_distance <- add_station_distance(l_merrylands, merrylands_lat, merrylands_lon)
l_auburn_distance <- add_station_distance(l_auburn, auburn_lat, auburn_lon)
l_eastwood_distance <- add_station_distance(l_eastwood, eastwood_lat, eastwood_lon)
l_granville_distance <- add_station_distance(l_granville, granville_lat, granville_lon)
# Classing Distances by 250m Intervals
# One shared set of breaks: every 0.25 km from 0 to 4 km. The hand-typed
# vectors skipped 2.75, which merged two classes into a single 500 m bin
# "(2.5,3]" and left the "(2.5,2.75]" / "(2.75,3]" classes used by later
# filters without any rows.
# Intervals are right-closed, giving labels such as "(0,0.25]"; distances of
# exactly 0 or above 4 km fall outside every class and become NA.
distance_breaks <- seq(0, 4, by = 0.25)
l_parramatta_distance$distance_class <- cut(l_parramatta_distance$distance_to_train_station.km., breaks = distance_breaks)
l_merrylands_distance$distance_class <- cut(l_merrylands_distance$distance_to_train_station.km., breaks = distance_breaks)
l_auburn_distance$distance_class <- cut(l_auburn_distance$distance_to_train_station.km., breaks = distance_breaks)
l_eastwood_distance$distance_class <- cut(l_eastwood_distance$distance_to_train_station.km., breaks = distance_breaks)
l_granville_distance$distance_class <- cut(l_granville_distance$distance_to_train_station.km., breaks = distance_breaks)
# Combines data sets of all 5 suburbs into one large dataframe
# Stack the five per-suburb tables into a single data frame.
combined_df <- do.call(
  rbind,
  list(
    l_parramatta_distance,
    l_merrylands_distance,
    l_auburn_distance,
    l_eastwood_distance,
    l_granville_distance
  )
)
# Four-digit sale year, stored as a factor so it can be used as a category.
combined_df[["Year"]] <- factor(format(as.Date(combined_df[["yearsold"]]), "%Y"))
Data used for the report was scraped from the internet using the following link: https://www.auhouseprices.com/sold/list/NSW/.
A total of 2036 data points from five suburbs were used. The number of points collected from each suburb is shown below:
Auburn: 675
Eastwood: 191
Granville: 308
Merrylands: 468
Parramatta: 394
We used these variables and cleaned the data in the following ways:
A function was created to calculate straight line distance from townhouses to train stations, which inaccurately represents travel distance between the two. Some townhouses are likely closer to stations from neighbouring suburbs instead. The relevance of trains as a mode of transport may differ between different suburbs. Additionally, train stations often coincide with commercial centres which may affect selling price.
A significant assumption was that no amenities close to train stations would increase the price of townhouses (e.g. shops, schools), which may be confounding variables. Another assumption was that all stations, regardless of how major, had an equal effect on selling prices.
Distances from stations were classed into 250 metre intervals to increase the readability of graphical summaries, as the data points produced cluttered scatterplots. A side-by-side boxplot was used to compare whether distance correlated to a change in price. The boxplot suggests there is no correlation between proximity to train stations and selling price. The residual plot illustrates clustering of data points on the bottom-left. Without random scatter, the data is not homoscedastic, hence a linear model is not appropriate and a more complex relationship may exist.
# Side-by-side boxplots of sold price per distance class.
# Prices are divided by 100000 so the values agree with the "(x$100000)" axis
# label and with the scale used by the other price plots in this report.
ggplot(combined_df, aes(x = distance_class, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(title = "Sold Price vs Distance from Train Station", x = "Distance from Train Station(km)", y = "Selling Price (x$100000)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(plot.title = element_text(hjust = 0.50))
# Simple linear model of sold price on distance, followed by its residual plot.
# na.exclude makes resid() return one value per row of combined_df (NA where a
# row was dropped from the fit), so the residuals always line up with the
# distance column even when some prices or distances are missing.
model <- lm(soldprice ~ distance_to_train_station.km., data = combined_df, na.action = na.exclude)
plot(combined_df$distance_to_train_station.km., resid(model), main = "Residual Plot for Sold Price against Distance to Train Station", xlab = "Distance to train station (km)", ylab = "Residuals", cex = 0.15)
# Reference line at zero residual.
abline(h = 0)
The numerical summary suggested no correlation.
For 0-250m from train station:
# Sold-price summary for the "(0,0.25]" distance class.
combined_df_0.00 <- combined_df %>%
  filter(distance_class == "(0,0.25]")
summary(combined_df_0.00[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 260000 337750 412500 489812 552500 1150000
For 0.75km-1km:
# Sold-price summary for the "(0.75,1]" distance class.
combined_df_0.75 <- combined_df %>%
  filter(distance_class == "(0.75,1]")
summary(combined_df_0.75[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 65000 395000 546500 585952 700750 1950000
For 3.75-4km:
# Sold-price summary for the "(3.75,4]" distance class.
combined_df_3.75 <- combined_df %>%
  filter(distance_class == "(3.75,4]")
summary(combined_df_3.75[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250000 316000 360000 398110 455250 664000
The median selling price for houses between 0 and 250 metres was $412500; it increased to $546500 between 0.75 and 1 kilometre, then decreased to $360000 between 3.75 and 4 kilometres. The fluctuation in median selling price over distance discounts the possibility of a linear correlation. Properties in Sydney within 400 metres of train stations have higher price growth (4.5%) compared to properties between 800 and 1600 metres (0.3%) (Forbes, 2021). Other research suggests that train stations have an insignificant correlation with property prices (r=0.091, p=0.380) (Berawi et al., 2020). Research also suggests that the number of rooms and building size were the most significant contributors to property pricing close to stations (Berawi et al., 2020).
The number of confounding variables alongside a more complex trend could account for the lack of correlation observed. Prices seemed to increase with the number of bedrooms, car-spaces and bathrooms. Yet after controlling for them, there was still no correlation. This suggests there are further confounding variables unaccounted for. To account for inflation, a side-by-side boxplot of selling price between 2000 and 2023 in Western Sydney suburbs was plotted. A general increase in townhouse price over the years was observed. Inflation must also be a significant confounding variable that had a substantial effect on selling price. The complex interaction of variables which affect property price could explain the absence of a correlation.
# Shared look for the four plots below: black-and-white theme, x labels
# rotated 45 degrees, centred title.
confounder_theme <- theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.50)
  )

# Sold price (in units of $100000) by year of sale.
combined_df %>%
  ggplot(aes(x = Year, y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(title = "Sold Price over Years", x = "Year", y = "Selling Price (x$100000)") +
  confounder_theme

# Sold price by number of bedrooms.
combined_df %>%
  ggplot(aes(x = factor(bedroom), y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(title = "Sold Price for Different Numbers of Bedrooms", x = "Number of Bedrooms", y = "Selling Price (x$100000)") +
  confounder_theme

# Sold price by number of bathrooms; rows without a bathroom count are dropped.
combined_df_bathroom <- combined_df %>%
  filter(!is.na(bathroom))
combined_df_bathroom %>%
  ggplot(aes(x = factor(bathroom), y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(title = "Sold Price for Different Numbers of Bathrooms", x = "Number of Bathrooms", y = "Selling Price (x$100000)") +
  confounder_theme

# Sold price by number of carspaces; rows without a carspace count are dropped.
combined_df_carspace <- combined_df %>%
  filter(!is.na(carspace))
combined_df_carspace %>%
  ggplot(aes(x = factor(carspace), y = soldprice / 100000)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(title = "Sold Price for Different Numbers of Carspaces", x = "Number of Carspaces", y = "Selling Price (x$100000)") +
  confounder_theme
Berawi, M. A., Miraj, P., Saroji, G., & Sari, M. (2020). Impact of rail transit station proximity to commercial property prices: Utilizing big data in Urban Real Estate. Journal of Big Data, 7(1), 1–17. https://doi.org/10.1186/s40537-020-00348-z
Bowes, D. R., & Ihlanfeldt, K. R. (2001). Identifying the impacts of rail transit stations on residential property values. Journal of Urban Economics, 50(1), 1–25. https://doi.org/10.1006/juec.2001.2214
Forbes, K. (2021, August 12). Does a train station increase the value of a property? Metropole Property Strategists. Retrieved April 10, 2023, from https://metropole.com.au/how-have-train-stations-affected-property-prices-in-sydney/#:~:text=It%20found%20that%20properties%20within,a%20growth%20rate%20of%200.3%25.
When did your team meet (date and time), and what did each team member contribute?
All members of our team met on 11/3, 18/3, 25/3 and 1/4 from approximately 1:30-3:30pm. On 8/4/23, Brandon and Jasmine met separately to discuss coding ideas from 7:30-9:30pm. On the 15/4/23, all team members met together once more for a 2 hour meeting from 9-11pm to finalise the presentation methods, conclusions and graphical outputs to complete the project.
Overall, our group was split into a coding team consisting of Jasmine Gu, Brandon Lu and Byungjun Kim, and a writing team consisting of Gihansa Kottasha Vidhanelage, Yvonne Zhao and Sakurako Suzuki. Jasmine Gu led coding and incorporated suggestions from all team members into the research ideas and graphical/numerical outputs. Byungjun and Brandon supported this with their own coding ideas. Brandon further created a daily to-do-list and logbook on Github that kept track of progress in the research and what was discussed each day. Yvonne and Gihansa focused on report writing while taking suggestions and edits from other team members. Sakurako focused on the video script and created a slideshow for the presentation. Byungjun did the final video editing by compiling all clips. Jasmine put the final html file together by formatting everything and confirmed all sections were complete and met requirements.
To get an idea of what the data frame looked like:
# Print the first rows of the combined data set.
combined_df %>% head()
## address bedroom bathroom carspace soldprice
## 1 13/43 Pemberton Street Parramatta 2150 3 2 2 997000
## 2 E6/88-98 Marsden Street Parramatta 2150 3 3 2 810000
## 3 5/15-17 Grandview Street Parramatta 2150 3 3 1 930000
## 4 1/46-48 Pemberton Street Parramatta 2150 4 2 2 1015250
## 5 7/46-48 Morton Street Parramatta 2150 4 2 2 1100000
## 6 5/1 Wandsworth Street Parramatta 2150 5 2 2 1010000
## yearsold latitude longitude distance_to_train_station.km. distance_class
## 1 2023-03-18 -33.80935 151.0202 1.6703801 (1.5,1.75]
## 2 2022-11-03 -33.82028 150.9988 0.6494166 (0.5,0.75]
## 3 2022-11-03 -33.80949 151.0190 1.5695289 (1.5,1.75]
## 4 2022-10-28 -33.81028 151.0210 1.6812388 (1.5,1.75]
## 5 2022-10-08 -33.80927 151.0184 1.5441637 (1.5,1.75]
## 6 2022-09-10 -33.81102 151.0151 1.1824291 (1,1.25]
## Year
## 1 2023
## 2 2022
## 3 2022
## 4 2022
## 5 2022
## 6 2022
address (qualitative): The address of the property
bedroom (quantitative): The number of bedrooms in this property
bathroom (quantitative): The number of bathrooms in this property
carspace (quantitative): The number of carspaces in this property
soldprice (quantitative): The final selling price of the property
yearsold (date): The date on which the property was sold
latitude (quantitative): The latitude of the property
longitude (quantitative): The longitude of the property
distance_to_train_station.km. (quantitative): The distance between the property and the train station of that suburb in kilometres
distance_class (qualitative): The 250m interval class in which the distance between the property and train station lies in
Year (qualitative): The year in which the property was sold
For clients who may visualise more easily with colours (each colour
corresponds to the respective horizontal axis label below it) :
Overall Graph:
# Coloured version of the price-by-distance boxplot: one fill colour per
# distance class, legend hidden because the x axis already names each class.
# Prices are divided by 100000 so the values agree with the "(x$100000)" axis
# label and with the scale used by the other price plots in this report.
ggplot(combined_df, aes(x = distance_class, y = soldprice / 100000, fill = distance_class)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(title = "Sold Price vs Distance from Train Station", x = "Distance from Train Station(km)", y = "Selling Price (x$100000)") +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme(plot.title = element_text(hjust = 0.50)) +
  theme(legend.position = "none")
Confounding Variable Graphs
# Shared look for the coloured plots below: black-and-white theme, x labels
# rotated 45 degrees, centred title, no legend.
coloured_theme <- theme_bw() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.50),
    legend.position = "none"
  )

# Sold price by year of sale, one fill colour per year.
combined_df %>%
  ggplot(aes(x = Year, y = soldprice / 100000, fill = Year)) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(title = "Sold Price over Years", x = "Year", y = "Selling Price (x$100000)") +
  coloured_theme

# Sold price by number of bedrooms ("RdPu" palette).
combined_df %>%
  ggplot(aes(x = factor(bedroom), y = soldprice / 100000, fill = factor(bedroom))) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(title = "Sold Price for Different Numbers of Bedrooms", x = "Number of Bedrooms", y = "Selling Price (x$100000)") +
  coloured_theme +
  scale_fill_brewer(palette = "RdPu")

# Sold price by number of bathrooms ("PuBu" palette); rows without a bathroom
# count are dropped first.
combined_df_bathroom <- combined_df %>%
  filter(!is.na(bathroom))
combined_df_bathroom %>%
  ggplot(aes(x = factor(bathroom), y = soldprice / 100000, fill = factor(bathroom))) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(title = "Sold Price for Different Numbers of Bathrooms", x = "Number of Bathrooms", y = "Selling Price (x$100000)") +
  coloured_theme +
  scale_fill_brewer(palette = "PuBu")

# Sold price by number of carspaces ("Greens" palette); rows without a
# carspace count are dropped first.
combined_df_carspace <- combined_df %>%
  filter(!is.na(carspace))
combined_df_carspace %>%
  ggplot(aes(x = factor(carspace), y = soldprice / 100000, fill = factor(carspace))) +
  geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
  labs(title = "Sold Price for Different Numbers of Carspaces", x = "Number of Carspaces", y = "Selling Price (x$100000)") +
  coloured_theme +
  scale_fill_brewer(palette = "Greens")
(Colours determined by number of carspaces) - Clients may use this section to take a closer look at townhouse sale price data that is specific to the number of bedrooms they are interested in.
# One subset of the combined data per bedroom count (1 to 5).
combined_df_1bed <- combined_df %>% filter(bedroom == 1)
combined_df_2bed <- combined_df %>% filter(bedroom == 2)
combined_df_3bed <- combined_df %>% filter(bedroom == 3)
combined_df_4bed <- combined_df %>% filter(bedroom == 4)
combined_df_5bed <- combined_df %>% filter(bedroom == 5)
# Boxplots of sold price (in units of $100000) per distance class for one
# bedroom-count subset.
#
# Args:
#   data: subset of combined_df to plot.
#   bedroom_label: text appended to the title, e.g. "2 Bedrooms".
#   title_hjust: horizontal justification of the plot title.
#   by_carspace: when TRUE, boxes are split and filled by number of carspaces
#     ("Pastel1" palette, legend titled "Number of Carspaces").
#
# Returns:
#   The ggplot object; it is drawn when printed at top level.
#
# Replaces ten copy-pasted plot definitions. The fill scale and fill legend
# title are now added only when a fill aesthetic is actually mapped (they had
# no effect on the plain plots).
plot_price_by_distance_for_bedrooms <- function(data, bedroom_label,
                                                title_hjust = 0.25,
                                                by_carspace = FALSE) {
  boxes <- if (by_carspace) {
    geom_boxplot(outlier.colour = "blue", outlier.size = 1.0, aes(fill = factor(carspace)))
  } else {
    geom_boxplot(outlier.colour = "blue", outlier.size = 1.0)
  }
  p <- ggplot(data, aes(x = distance_class, y = soldprice / 100000)) +
    boxes +
    labs(
      title = paste("Sold Price vs Distance from Train Station for", bedroom_label),
      x = "Distance from Train Station(km)",
      y = "Selling Price (x$100000)"
    ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    theme(plot.title = element_text(hjust = title_hjust))
  if (by_carspace) {
    p <- p +
      labs(fill = "Number of Carspaces") +
      scale_fill_brewer(palette = "Pastel1")
  }
  p
}

# 1 bedroom (title centred, as before)
plot_price_by_distance_for_bedrooms(combined_df_1bed, "1 Bedroom", title_hjust = 0.50)
plot_price_by_distance_for_bedrooms(combined_df_1bed, "1 Bedroom", title_hjust = 0.50, by_carspace = TRUE)
summary(combined_df_1bed$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 243000 426500 610000 701000 930000 1250000

# 2 bedrooms
plot_price_by_distance_for_bedrooms(combined_df_2bed, "2 Bedrooms")
plot_price_by_distance_for_bedrooms(combined_df_2bed, "2 Bedrooms", by_carspace = TRUE)
summary(combined_df_2bed$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 185000 345000 422000 454197 535750 1470000

# 3 bedrooms
plot_price_by_distance_for_bedrooms(combined_df_3bed, "3 Bedrooms")
plot_price_by_distance_for_bedrooms(combined_df_3bed, "3 Bedrooms", by_carspace = TRUE)
summary(combined_df_3bed$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 575 410000 536000 574283 685000 2020000

# 4 bedrooms
plot_price_by_distance_for_bedrooms(combined_df_4bed, "4 Bedrooms")
plot_price_by_distance_for_bedrooms(combined_df_4bed, "4 Bedrooms", by_carspace = TRUE)
summary(combined_df_4bed$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100000 525000 672500 704376 840000 3000000

# 5 bedrooms
plot_price_by_distance_for_bedrooms(combined_df_5bed, "5 Bedrooms")
plot_price_by_distance_for_bedrooms(combined_df_5bed, "5 Bedrooms", by_carspace = TRUE)
summary(combined_df_5bed$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 595000 769004 1010000 1175092 1514000 2090000
# One subset per bedroom / carspace combination examined below.
combined_df_1bed_1car <- combined_df %>% filter(bedroom == 1 & carspace == 1)
combined_df_2bed_1car <- combined_df %>% filter(bedroom == 2 & carspace == 1)
combined_df_2bed_2car <- combined_df %>% filter(bedroom == 2 & carspace == 2)
combined_df_3bed_1car <- combined_df %>% filter(bedroom == 3 & carspace == 1)
combined_df_3bed_2car <- combined_df %>% filter(bedroom == 3 & carspace == 2)
combined_df_3bed_3car <- combined_df %>% filter(bedroom == 3 & carspace == 3)
combined_df_3bed_4car <- combined_df %>% filter(bedroom == 3 & carspace == 4)
combined_df_4bed_1car <- combined_df %>% filter(bedroom == 4 & carspace == 1)
combined_df_4bed_2car <- combined_df %>% filter(bedroom == 4 & carspace == 2)
combined_df_4bed_3car <- combined_df %>% filter(bedroom == 4 & carspace == 3)
combined_df_4bed_4car <- combined_df %>% filter(bedroom == 4 & carspace == 4)
combined_df_5bed_1car <- combined_df %>% filter(bedroom == 5 & carspace == 1)
combined_df_5bed_2car <- combined_df %>% filter(bedroom == 5 & carspace == 2)
combined_df_5bed_3car <- combined_df %>% filter(bedroom == 5 & carspace == 3)
# Boxplots of sold price (in units of $100000) per distance class for one
# bedroom/carspace subset. `group_label` completes the plot title, e.g.
# "2 Bedrooms and 1 Carspace". Returns the ggplot object, which is drawn when
# printed at top level.
plot_bed_car_prices <- function(data, group_label) {
  ggplot(data, aes(x = distance_class, y = soldprice / 100000)) +
    geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
    labs(
      title = paste("Sold Price vs Distance from Train Station for", group_label),
      x = "Distance from Train Station(km)",
      y = "Selling Price (x$100000)"
    ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

plot_bed_car_prices(combined_df_1bed_1car, "1 Bedroom and 1 Carspace")
summary(combined_df_1bed_1car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 243000 334750 426500 426500 518250 610000
plot_bed_car_prices(combined_df_2bed_1car, "2 Bedrooms and 1 Carspace")
summary(combined_df_2bed_1car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 230000 343875 419500 455068 530000 1470000
plot_bed_car_prices(combined_df_2bed_2car, "2 Bedrooms and 2 Carspaces")
summary(combined_df_2bed_2car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 225000 365000 460200 483116 585500 1120000
plot_bed_car_prices(combined_df_3bed_1car, "3 Bedrooms and 1 Carspace")
summary(combined_df_3bed_1car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 73000 393000 515000 541211 645375 2020000
plot_bed_car_prices(combined_df_3bed_2car, "3 Bedrooms and 2 Carspaces")
summary(combined_df_3bed_2car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 575 425750 595000 604433 700000 1950000
plot_bed_car_prices(combined_df_3bed_3car, "3 Bedrooms and 3 Carspaces")
summary(combined_df_3bed_3car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 65000 458750 682500 607625 719750 1150000
plot_bed_car_prices(combined_df_3bed_4car, "3 Bedrooms and 4 Carspaces")
summary(combined_df_3bed_4car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 429500 471750 488000 547062 575000 770000
plot_bed_car_prices(combined_df_4bed_1car, "4 Bedrooms and 1 Carspace")
summary(combined_df_4bed_1car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 320000 440000 565000 613430 701500 1625000
plot_bed_car_prices(combined_df_4bed_2car, "4 Bedrooms and 2 Carspaces")
summary(combined_df_4bed_2car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 160000 550000 710000 721376 851000 1950000
plot_bed_car_prices(combined_df_4bed_3car, "4 Bedrooms and 3 Carspaces")
summary(combined_df_4bed_3car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100000 606000 781500 904300 832250 3000000
plot_bed_car_prices(combined_df_4bed_4car, "4 Bedrooms and 4 Carspaces")
summary(combined_df_4bed_4car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 456000 580250 595000 629000 691250 800000
plot_bed_car_prices(combined_df_5bed_1car, "5 Bedrooms and 1 Carspace")
summary(combined_df_5bed_1car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 615000 615000 615000 615000 615000 615000
plot_bed_car_prices(combined_df_5bed_2car, "5 Bedrooms and 2 Carspaces")
summary(combined_df_5bed_2car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 595000 842500 1165000 1231101 1571000 2090000
plot_bed_car_prices(combined_df_5bed_3car, "5 Bedrooms and 3 Carspaces")
summary(combined_df_5bed_3car[["soldprice"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
##
# Split the sales into one data frame per distance-from-station class.
# Each `distance_class` label is a 0.25 km band such as "(0.5,0.75]"; the
# suffix of every data frame name is the lower bound of its band in km.
# These subsets are what the price-vs-year plots are drawn from.
combined_df_0.00 <- combined_df %>% filter(distance_class == "(0,0.25]")
combined_df_0.25 <- combined_df %>% filter(distance_class == "(0.25,0.5]")
combined_df_0.50 <- combined_df %>% filter(distance_class == "(0.5,0.75]")
combined_df_0.75 <- combined_df %>% filter(distance_class == "(0.75,1]")
combined_df_1.00 <- combined_df %>% filter(distance_class == "(1,1.25]")
combined_df_1.25 <- combined_df %>% filter(distance_class == "(1.25,1.5]")
combined_df_1.50 <- combined_df %>% filter(distance_class == "(1.5,1.75]")
combined_df_1.75 <- combined_df %>% filter(distance_class == "(1.75,2]")
combined_df_2.00 <- combined_df %>% filter(distance_class == "(2,2.25]")
combined_df_2.25 <- combined_df %>% filter(distance_class == "(2.25,2.5]")
combined_df_2.50 <- combined_df %>% filter(distance_class == "(2.5,2.75]")
combined_df_2.75 <- combined_df %>% filter(distance_class == "(2.75,3]")
combined_df_3.00 <- combined_df %>% filter(distance_class == "(3,3.25]")
combined_df_3.25 <- combined_df %>% filter(distance_class == "(3.25,3.5]")
combined_df_3.50 <- combined_df %>% filter(distance_class == "(3.5,3.75]")
combined_df_3.75 <- combined_df %>% filter(distance_class == "(3.75,4]")
# Sold price against year of sale, one box plot per distance-from-station
# band, each followed by summary() of that band's sold prices. The `##` lines
# appear to be output captured from an earlier run.

# Build the sold-price-by-year box plot for one distance band.
#   sales       data frame providing `Year` and `soldprice`
#   plot_title  title shown above the plot
# Prices are divided by 100000 so the y axis is in units of $100,000.
# Returns the ggplot object; calling it at top level prints the plot.
price_by_year_boxplot <- function(sales, plot_title) {
  ggplot(sales, aes(x = Year, y = soldprice / 100000)) +
    geom_boxplot(outlier.colour = "blue", outlier.size = 1.5) +
    labs(
      title = plot_title,
      x = "Year",
      y = "Selling Price (x$100000)"
    ) +
    theme_bw() +
    # tilt the year labels so neighbouring labels do not overlap
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

price_by_year_boxplot(
  combined_df_0.00,
  "Sold Price vs year for townhouses 0 to 0.25km from train station"
)
summary(combined_df_0.00$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 260000 337750 412500 489812 552500 1150000

price_by_year_boxplot(
  combined_df_0.25,
  "Sold Price vs year for townhouses 0.25 to 0.50km from train station"
)
summary(combined_df_0.25$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 165000 365000 506000 518858 635000 1545000

price_by_year_boxplot(
  combined_df_0.50,
  "Sold Price vs year for townhouses 0.50 to 0.75km from train station"
)
summary(combined_df_0.50$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 185000 395000 535500 569420 685000 2020000

price_by_year_boxplot(
  combined_df_0.75,
  "Sold Price vs year for townhouses 0.75 to 1.00km from train station"
)
summary(combined_df_0.75$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 65000 395000 546500 585952 700750 1950000

# NOTE(review): the recorded minimum of 575 looks implausible for a sale
# price - possibly a mis-scraped value; confirm against the scraped data.
price_by_year_boxplot(
  combined_df_1.00,
  "Sold Price vs year for townhouses 1.00 to 1.25km from train station"
)
summary(combined_df_1.00$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 575 380000 481000 546800 660000 1950000

price_by_year_boxplot(
  combined_df_1.25,
  "Sold Price vs year for townhouses 1.25 to 1.50km from train station"
)
summary(combined_df_1.25$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100000 395000 490000 541084 647498 3000000

price_by_year_boxplot(
  combined_df_1.50,
  "Sold Price vs year for townhouses 1.50 to 1.75km from train station"
)
summary(combined_df_1.50$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 310000 435000 571500 595094 652750 2090000

price_by_year_boxplot(
  combined_df_1.75,
  "Sold Price vs year for townhouses 1.75 to 2.00km from train station"
)
summary(combined_df_1.75$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 286000 406000 560000 603858 757000 1515000

price_by_year_boxplot(
  combined_df_2.00,
  "Sold Price vs year for townhouses 2.00 to 2.25km from train station"
)
summary(combined_df_2.00$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 160000 400000 491000 499950 605000 800000

price_by_year_boxplot(
  combined_df_2.25,
  "Sold Price vs year for townhouses 2.25 to 2.50km from train station"
)
summary(combined_df_2.25$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 270000 377000 465000 508040 637500 1230000

# NOTE(review): no summary values were recorded for the next two bands -
# presumably they contain no sales; confirm before relying on these plots.
price_by_year_boxplot(
  combined_df_2.50,
  "Sold Price vs year for townhouses 2.50 to 2.75km from train station"
)
summary(combined_df_2.50$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
##

price_by_year_boxplot(
  combined_df_2.75,
  "Sold Price vs year for townhouses 2.75 to 3.00km from train station"
)
summary(combined_df_2.75$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
##

price_by_year_boxplot(
  combined_df_3.00,
  "Sold Price vs year for townhouses 3.00 to 3.25km from train station"
)
summary(combined_df_3.00$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 305000 484500 710000 706013 773500 1777000

# Title states the band as 3.25 to 3.50km: this subset is the "(3.25,3.5]"
# distance class, and the 3.50 to 3.75km band has its own plot below.
price_by_year_boxplot(
  combined_df_3.25,
  "Sold Price vs year for townhouses 3.25 to 3.50km from train station"
)
summary(combined_df_3.25$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 555000 555000 555000 555000 555000 555000

price_by_year_boxplot(
  combined_df_3.50,
  "Sold Price vs year for townhouses 3.50 to 3.75km from train station"
)
summary(combined_df_3.50$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250000 262000 274000 274000 286000 298000

price_by_year_boxplot(
  combined_df_3.75,
  "Sold Price vs year for townhouses 3.75 to 4.00km from train station"
)
summary(combined_df_3.75$soldprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250000 316000 360000 398110 455250 664000